In [1]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline
In [2]:
train = pd.read_csv('d:/pydoc/personalised_learning/train.csv')
In [3]:
train_original=train.copy()
In [4]:
train.shape
Out[4]:
(31962, 3)
In [5]:
train_original
Out[5]:
id label tweet
0 1 0 @user when a father is dysfunctional and is s...
1 2 0 @user @user thanks for #lyft credit i can't us...
2 3 0 bihday your majesty
3 4 0 #model i love u take with u all the time in ...
4 5 0 factsguide: society now #motivation
... ... ... ...
31957 31958 0 ate @user isz that youuu?😍😍😍😍😍ð...
31958 31959 0 to see nina turner on the airwaves trying to...
31959 31960 0 listening to sad songs on a monday morning otw...
31960 31961 1 @user #sikh #temple vandalised in in #calgary,...
31961 31962 0 thank you @user for you follow

31962 rows × 3 columns

In [6]:
test = pd.read_csv('d:/pydoc/personalised_learning/test.csv')
In [7]:
test_original=test.copy()
In [8]:
test.shape
Out[8]:
(17197, 2)
In [9]:
test_original
Out[9]:
id tweet
0 31963 #studiolife #aislife #requires #passion #dedic...
1 31964 @user #white #supremacists want everyone to s...
2 31965 safe ways to heal your #acne!! #altwaystohe...
3 31966 is the hp and the cursed child book up for res...
4 31967 3rd #bihday to my amazing, hilarious #nephew...
... ... ...
17192 49155 thought factory: left-right polarisation! #tru...
17193 49156 feeling like a mermaid 😘 #hairflip #neverre...
17194 49157 #hillary #campaigned today in #ohio((omg)) &am...
17195 49158 happy, at work conference: right mindset leads...
17196 49159 my song "so glad" free download! #shoegaze ...

17197 rows × 2 columns

In [10]:
# Stack train and test into one frame for joint text cleaning.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0 —
# pd.concat is the supported equivalent (test rows get label=NaN;
# sort=True orders the columns alphabetically as before).
combine = pd.concat([train, test], ignore_index=True, sort=True)
In [11]:
combine.head()
Out[11]:
id label tweet
0 1 0.0 @user when a father is dysfunctional and is s...
1 2 0.0 @user @user thanks for #lyft credit i can't us...
2 3 0.0 bihday your majesty
3 4 0.0 #model i love u take with u all the time in ...
4 5 0.0 factsguide: society now #motivation
In [12]:
combine.tail()
Out[12]:
id label tweet
49154 49155 NaN thought factory: left-right polarisation! #tru...
49155 49156 NaN feeling like a mermaid 😘 #hairflip #neverre...
49156 49157 NaN #hillary #campaigned today in #ohio((omg)) &am...
49157 49158 NaN happy, at work conference: right mindset leads...
49158 49159 NaN my song "so glad" free download! #shoegaze ...
In [13]:
def remove_pattern(text, pattern):
    """Delete every substring of `text` matching the regex `pattern`.

    The original implementation looped over re.findall() results and fed
    each matched *substring* back into re.sub() as a pattern, which breaks
    whenever a match contains regex metacharacters. Substituting the
    pattern directly is equivalent for patterns like "@[\\w]*" and safe
    in general.

    Parameters
    ----------
    text : str
        The tweet (or any string) to clean.
    pattern : str
        Regular expression describing the substrings to remove.

    Returns
    -------
    str
        `text` with every match of `pattern` removed.
    """
    return re.sub(pattern, "", text)
In [14]:
#Removing Twitter Handles (@user)
# Raw string: "@[\w]*" without the r-prefix is an invalid escape sequence
# (a SyntaxWarning on modern Python). np.vectorize maps remove_pattern
# over every tweet.
combine['Tidy_Tweets'] = np.vectorize(remove_pattern)(combine['tweet'], r"@[\w]*")

combine.head()
Out[14]:
id label tweet Tidy_Tweets
0 1 0.0 @user when a father is dysfunctional and is s... when a father is dysfunctional and is so sel...
1 2 0.0 @user @user thanks for #lyft credit i can't us... thanks for #lyft credit i can't use cause th...
2 3 0.0 bihday your majesty bihday your majesty
3 4 0.0 #model i love u take with u all the time in ... #model i love u take with u all the time in ...
4 5 0.0 factsguide: society now #motivation factsguide: society now #motivation
In [15]:
# Replace everything that is not a letter or '#' with a space.
# regex=True is now explicit: pandas 2.0 changed Series.str.replace's
# default to literal (regex=False) matching, which would silently make
# this a no-op. The raw string also avoids escape-sequence warnings.
combine['Tidy_Tweets'] = combine['Tidy_Tweets'].str.replace(r"[^a-zA-Z#]", " ", regex=True)
In [16]:
combine.head(10)
Out[16]:
id label tweet Tidy_Tweets
0 1 0.0 @user when a father is dysfunctional and is s... when a father is dysfunctional and is so sel...
1 2 0.0 @user @user thanks for #lyft credit i can't us... thanks for #lyft credit i can t use cause th...
2 3 0.0 bihday your majesty bihday your majesty
3 4 0.0 #model i love u take with u all the time in ... #model i love u take with u all the time in ...
4 5 0.0 factsguide: society now #motivation factsguide society now #motivation
5 6 0.0 [2/2] huge fan fare and big talking before the... huge fan fare and big talking before the...
6 7 0.0 @user camping tomorrow @user @user @user @use... camping tomorrow danny
7 8 0.0 the next school year is the year for exams.ðŸ˜... the next school year is the year for exams ...
8 9 0.0 we won!!! love the land!!! #allin #cavs #champ... we won love the land #allin #cavs #champ...
9 10 0.0 @user @user welcome here ! i'm it's so #gr... welcome here i m it s so #gr
In [17]:
#Removing Short Words
# Drop tokens of three characters or fewer (e.g. "u", "the", "for") and
# rejoin the survivors with single spaces.
combine['Tidy_Tweets'] = combine['Tidy_Tweets'].apply(
    lambda tweet: ' '.join(word for word in tweet.split() if len(word) > 3))

combine.head(10)
Out[17]:
id label tweet Tidy_Tweets
0 1 0.0 @user when a father is dysfunctional and is s... when father dysfunctional selfish drags kids i...
1 2 0.0 @user @user thanks for #lyft credit i can't us... thanks #lyft credit cause they offer wheelchai...
2 3 0.0 bihday your majesty bihday your majesty
3 4 0.0 #model i love u take with u all the time in ... #model love take with time
4 5 0.0 factsguide: society now #motivation factsguide society #motivation
5 6 0.0 [2/2] huge fan fare and big talking before the... huge fare talking before they leave chaos disp...
6 7 0.0 @user camping tomorrow @user @user @user @use... camping tomorrow danny
7 8 0.0 the next school year is the year for exams.ðŸ˜... next school year year exams think about that #...
8 9 0.0 we won!!! love the land!!! #allin #cavs #champ... love land #allin #cavs #champions #cleveland #...
9 10 0.0 @user @user welcome here ! i'm it's so #gr... welcome here
In [18]:
#Tokenization

# Split each cleaned tweet on whitespace so every tweet becomes a list of
# word/hashtag tokens for the stemming step below. Series.str.split() is
# the vectorized equivalent of applying str.split to each row.
tokenized_tweet = combine['Tidy_Tweets'].str.split()
tokenized_tweet.head()
Out[18]:
0    [when, father, dysfunctional, selfish, drags, ...
1    [thanks, #lyft, credit, cause, they, offer, wh...
2                              [bihday, your, majesty]
3                     [#model, love, take, with, time]
4                   [factsguide, society, #motivation]
Name: Tidy_Tweets, dtype: object
In [19]:
#Stemming

#Stemming is a rule-based process of stripping suffixes
#("ing", "ly", "es", "s", etc.) from a word. For example, "play",
#"player", "played", "plays" and "playing" are variations of the word "play".

from nltk import PorterStemmer

ps = PorterStemmer()

# Stem every token of every tweet (e.g. "dysfunctional" -> "dysfunct").
tokenized_tweet = tokenized_tweet.apply(lambda x: [ps.stem(i) for i in x])

tokenized_tweet.head()
Out[19]:
0    [when, father, dysfunct, selfish, drag, kid, i...
1    [thank, #lyft, credit, caus, they, offer, whee...
2                              [bihday, your, majesti]
3                     [#model, love, take, with, time]
4                         [factsguid, societi, #motiv]
Name: Tidy_Tweets, dtype: object
In [20]:
# Now let's stitch these tokens back together.

# Join each token list back into a single space-separated string.
# Using .apply replaces the original positional loop
# (tokenized_tweet[i] = ...), which mutated the Series in place and
# relied on the index being exactly 0..n-1 (label-based indexing would
# break on any other index).
tokenized_tweet = tokenized_tweet.apply(lambda tokens: ' '.join(tokens))
combine['Tidy_Tweets'] = tokenized_tweet
combine.head()
Out[20]:
id label tweet Tidy_Tweets
0 1 0.0 @user when a father is dysfunctional and is s... when father dysfunct selfish drag kid into dys...
1 2 0.0 @user @user thanks for #lyft credit i can't us... thank #lyft credit caus they offer wheelchair ...
2 3 0.0 bihday your majesty bihday your majesti
3 4 0.0 #model i love u take with u all the time in ... #model love take with time
4 5 0.0 factsguide: society now #motivation factsguid societi #motiv
In [21]:
# Importing Packages necessary for generating a WordCloud
from wordcloud import WordCloud,ImageColorGenerator
from PIL import Image
import urllib
import requests
In [22]:
#Store all the words from the dataset which are non-racist/sexist
# label == 0 marks the non-racist/sexist tweets; concatenate them into one
# big space-separated string for the word cloud.
positive_tweets = combine['Tidy_Tweets'][combine['label'] == 0]
all_words_positive = ' '.join(positive_tweets)
In [23]:
#We can see most of the words are positive or neutral. With happy,
#smile, and love being the most frequent ones. Hence, most of the
#frequent words are compatible with the sentiment which is non racist/sexists tweets.

# Download a Twitter-bird silhouette to use as the word-cloud mask.
# NOTE(review): this fetches over plain HTTP at run time — the cell fails
# offline or if the URL disappears; consider caching the image locally.
Mask = np.array(Image.open(requests.get('http://clipart-library.com/image_gallery2/Twitter-PNG-Image.png', stream=True).raw))

# We use the ImageColorGenerator library from Wordcloud 
# Here we take the color of the image and impose it over our wordcloud
image_colors = ImageColorGenerator(Mask)

# Now we use the WordCloud function from the wordcloud library 
wc = WordCloud(background_color='black', height=1500, width=4000,mask=Mask).generate(all_words_positive)

# Size of the image generated 
plt.figure(figsize=(10,20))

# Here we recolor the words from the dataset to the image's color
# recolor just recolors the default colors to the image's blue color
# interpolation is used to smooth the image generated 
plt.imshow(wc.recolor(color_func=image_colors),interpolation="hamming")

plt.axis('off')
plt.show()
In [24]:
 all_words_negative = ' '.join(text for text in combine['Tidy_Tweets'][combine['label']==1])
In [25]:
#As we can clearly see, most of 
#the words have negative connotations.
#So, it seems we have a pretty good text data to work on.

# Download the same Twitter-bird mask again.
# NOTE(review): this cell duplicates the positive word-cloud cell almost
# line for line (only the word string and interpolation differ) — a shared
# helper function would remove the copy-paste; also requires network access.
Mask = np.array(Image.open(requests.get('http://clipart-library.com/image_gallery2/Twitter-PNG-Image.png', stream=True).raw))

# We use the ImageColorGenerator library from Wordcloud 
# Here we take the color of the image and impose it over our wordcloud
image_colors = ImageColorGenerator(Mask)

# Now we use the WordCloud function from the wordcloud library 
wc = WordCloud(background_color='black', height=1500, width=4000,mask=Mask).generate(all_words_negative)

# Size of the image generated 
plt.figure(figsize=(10,20))

# Here we recolor the words from the dataset to the image's color
# recolor just recolors the default colors to the image's blue color
# interpolation is used to smooth the image generated 
plt.imshow(wc.recolor(color_func=image_colors),interpolation="gaussian")

plt.axis('off')
plt.show()
In [26]:
#Function to extract hashtags from tweets
def Hashtags_Extract(x):
    """Collect the hashtags of every tweet in `x` as a nested list.

    Parameters
    ----------
    x : iterable of str
        Cleaned tweet strings.

    Returns
    -------
    list of list of str
        One sub-list per tweet (order preserved) holding that tweet's
        hashtag words without the leading '#'; empty sub-list when a
        tweet carries no hashtags.
    """
    return [re.findall(r'#(\w+)', tweet) for tweet in x]
In [27]:
# A nested list of all the hastags from the positive
# reviews from the datsets
# 0 means positive tweets
ht_positive = Hashtags_Extract(combine['Tidy_Tweets'][combine['label']==0])
In [28]:
ht_positive_unnest = sum(ht_positive,[])
In [29]:
## A Nested list of all the hashtags from the negative
# reviews from the datasets
# 1 means negative tweets
ht_negative = Hashtags_Extract(combine['Tidy_Tweets'][combine['label']==1])
In [30]:
ht_negative_unnest = sum(ht_negative,[])
In [31]:
#Plotting graphs
# For Positive datasets in the tweets
# counting the frequency of the 
# words having Positive Sentiment
In [32]:
word_freq_positive = nltk.FreqDist(ht_positive_unnest)

word_freq_positive
Out[32]:
FreqDist({'love': 1654, 'posit': 917, 'smile': 676, 'healthi': 573, 'thank': 534, 'fun': 463, 'life': 425, 'affirm': 423, 'summer': 390, 'model': 375, ...})
In [33]:
df_positive = pd.DataFrame({'Hashtags':list(word_freq_positive.keys()),'Count':list(word_freq_positive.values())})
In [34]:
df_positive.head(10)
Out[34]:
Hashtags Count
0 run 72
1 lyft 2
2 disapoint 1
3 getthank 2
4 model 375
5 motiv 202
6 allshowandnogo 1
7 school 30
8 exam 9
9 hate 27
In [35]:
#Plotting the barplot for the 20
# most frequent words used for
#hashtags (nlargest(20) keeps the top twenty rows by Count)

df_positive_plot = df_positive.nlargest(20,columns='Count') 
In [36]:
import seaborn as sns
sns.barplot(data=df_positive_plot,y='Hashtags',x='Count')
sns.despine()
In [37]:
import plotly.express as px
df = df_positive_plot
px.scatter(df, x="Count", y="Hashtags")
In [38]:
import plotly.express as px
df = df_positive_plot
fig = px.pie(df, values='Count', names='Hashtags', color_discrete_sequence=px.colors.sequential.RdBu)
fig.show()
In [39]:
# for negative tweets in datasets
#Counting the frequency of the words having Negative Sentiment
word_freq_negative = nltk.FreqDist(ht_negative_unnest)
In [40]:
word_freq_negative
Out[40]:
FreqDist({'trump': 136, 'polit': 95, 'allahsoil': 92, 'liber': 81, 'libtard': 77, 'sjw': 75, 'retweet': 63, 'black': 46, 'miami': 46, 'hate': 37, ...})
In [41]:
#Creating a dataframe for the most frequently used words in hashtags
df_negative = pd.DataFrame({'Hashtags':list(word_freq_negative.keys()),'Count':list(word_freq_negative.values())})
In [42]:
df_negative.head(10)
Out[42]:
Hashtags Count
0 cnn 10
1 michigan 2
2 tcot 14
3 australia 6
4 opkillingbay 5
5 seashepherd 22
6 helpcovedolphin 3
7 thecov 4
8 neverump 8
9 xenophobia 12
In [43]:
#Plotting the barplot for the 20 most frequent words used for hashtags
df_negative_plot = df_negative.nlargest(20,columns='Count') 
In [44]:
sns.barplot(data=df_negative_plot,y='Hashtags',x='Count')
sns.despine()
In [45]:
import plotly.express as px
df = df_negative_plot
fig = px.pie(df, values='Count', names='Hashtags', color_discrete_sequence=px.colors.sequential.RdBu)
fig.show()
In [46]:
#Extracting Features from cleaned Tweets
#Bag-of-Words Features

#Bag of Words is a method to extract features from text
#documents. These features can be used for training machine 
#learning algorithms. It creates a vocabulary of
#all the unique words occurring in all the documents
#in the training set.

#Consider a corpus (a collection of texts) called C of D documents {d1,d2…..dD} and N unique tokens extracted out of the corpus C. The N tokens (words) will form a list, and the size of the bag-of-words matrix M will be given by D X N. Each row in the matrix M contains the frequency of tokens in document D(i).

#For example, if you have 2 documents-

    #D1: He is a lazy boy. She is also lazy.

    #D2: Smith is a lazy person.

#First, it creates a vocabulary using unique words from all the documents
#[‘He’ , ’She’ , ’lazy’ , 'boy’ , 'Smith’ , ’person’]

   # Here, D=2, N=6

    #The matrix M of size 2 X 6 will be represented as:

#bow

#The above table depicts the training features 
#containing term frequencies of each word in each document. 
#This is called bag-of-words approach since the number of 
#occurrence and not sequence or order of words matters in this approach.


from sklearn.feature_extraction.text import CountVectorizer

# Ignore terms that appear in >90% of documents (max_df), require at least
# 2 occurrences (min_df), keep a vocabulary of at most 1000 terms, and
# drop English stop words.
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')

# bag-of-words feature matrix
bow = bow_vectorizer.fit_transform(combine['Tidy_Tweets'])

# Densify for display only — columns are vocabulary indices, not words.
df_bow = pd.DataFrame(bow.todense())

df_bow
Out[46]:
0 1 2 3 4 5 6 7 8 9 ... 990 991 992 993 994 995 996 997 998 999
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
49154 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
49155 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
49156 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
49157 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
49158 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

49159 rows × 1000 columns

In [47]:
#TF-IDF Features

#Tf-idf stands for term frequency-inverse document frequency, and the tf-idf weight is a weight often used in information retrieval and text mining. This weight is a statistical measure used to evaluate how important a word is to a document in a collection or corpus. The importance increases proportionally to the number of times a word appears in the document but is offset by the frequency of the word in the corpus.

#Typically, the tf-idf weight is composed by two terms: the first computes the normalized Term Frequency (TF), aka. the number of times a word appears in a document, divided by the total number of words in that document; the second term is the Inverse Document Frequency (IDF), computed as the logarithm of the number of the documents in the corpus divided by the number of documents where the specific term appears.

    #TF: Term Frequency, which measures how frequently a term occurs in a document. Since every document is different in length, it is possible that a term would appear much more times in long documents than shorter ones. Thus, the term frequency is often divided by the document length (aka. the total number of terms in the document) as a way of normalization:
    #TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document).

   # IDF: Inverse Document Frequency, which measures how important a term is. While computing TF, all terms are considered equally important. However it is known that certain terms, such as "is", "of", and "that", may appear a lot of times but have little importance. Thus we need to weigh down the frequent terms while scale up the rare ones, by computing the following:
    #IDF(t) = log_e(Total number of documents / Number of documents with term t in it).
    
    
from sklearn.feature_extraction.text import TfidfVectorizer

# Same vocabulary limits as the bag-of-words vectorizer so the two
# feature sets are comparable.
tfidf=TfidfVectorizer(max_df=0.90, min_df=2,max_features=1000,stop_words='english')

tfidf_matrix=tfidf.fit_transform(combine['Tidy_Tweets'])

# Densify for display only — columns are vocabulary indices, not words.
df_tfidf = pd.DataFrame(tfidf_matrix.todense())

df_tfidf  
    
Out[47]:
0 1 2 3 4 5 6 7 8 9 ... 990 991 992 993 994 995 996 997 998 999
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
49154 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
49155 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
49156 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
49157 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
49158 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

49159 rows × 1000 columns

In [48]:
#Applying Machine Learning Models
#Using the features from Bag-of-Words Model for training set
# Slice by len(train) instead of the hard-coded 31962: `combine` was built
# as train followed by test, so the first len(train) rows are the labelled
# training tweets. This stays correct if the training CSV changes size.
train_bow = bow[:len(train)]

train_bow.todense()
Out[48]:
matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)
In [49]:
#Using features from TF-IDF for training set
# Same slicing logic as the bag-of-words split: the first len(train) rows
# of the combined matrix belong to the labelled training set (no magic 31962).
train_tfidf_matrix = tfidf_matrix[:len(train)]

train_tfidf_matrix.todense()
Out[49]:
matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])
In [50]:
#Splitting the data into training and validation set
from sklearn.model_selection import train_test_split
In [51]:
#Bag-of-Words Features
x_train_bow,x_valid_bow,y_train_bow,y_valid_bow = train_test_split(train_bow,train['label'],test_size=0.3,random_state=2)
In [52]:
#Using TF-IDF features
x_train_tfidf,x_valid_tfidf,y_train_tfidf,y_valid_tfidf = train_test_split(train_tfidf_matrix,train['label'],test_size=0.3,random_state=17)
In [53]:
#Random Forest Algorithm for Bag-Of_Words  Features
#Feature Scaling
from sklearn.preprocessing import StandardScaler

# with_mean=False scales by std only: centering a sparse matrix would
# densify it (StandardScaler rejects sparse input when with_mean=True).
# Fit on the training split only, then reuse the fitted scaler on the
# validation split to avoid leakage.
sc = StandardScaler(with_mean=False)
x_train_bow = sc.fit_transform(x_train_bow)
x_valid_bow = sc.transform(x_valid_bow)
In [54]:
#print(X_train,"",X_test)
In [55]:
# Train the Bag-of-Words model.
from sklearn.ensemble import RandomForestRegressor

# NOTE(review): this is a *regressor* fitted on a binary label, so
# y_pred is a continuous score that later cells round/threshold —
# a RandomForestClassifier would be the conventional choice here;
# confirm the regressor was intentional.
regressor = RandomForestRegressor(n_estimators=20, random_state=0)
regressor.fit(x_train_bow, y_train_bow)
y_pred = regressor.predict(x_valid_bow)
In [56]:
#Evaluating the Algorithm
from sklearn import metrics

print('Mean Absolute Error:', metrics.mean_absolute_error(y_valid_bow, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_valid_bow, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_valid_bow, y_pred)))
Mean Absolute Error: 0.0978415597848816
Mean Squared Error: 0.05258016233627957
Root Mean Squared Error: 0.2293036465830397
In [57]:
#Using Random Forest for Classification evalaute the Bag-Of-Words
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,f1_score
score_bow = accuracy_score(y_valid_bow, y_pred.round())
print(confusion_matrix(y_valid_bow,y_pred.round()))
print(classification_report(y_valid_bow,y_pred.round()))
print(score_bow)
[[8585  325]
 [ 313  366]]
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      8910
           1       0.53      0.54      0.53       679

    accuracy                           0.93      9589
   macro avg       0.75      0.75      0.75      9589
weighted avg       0.93      0.93      0.93      9589

0.9334654291375535
In [58]:
# The first part of the list is predicting probabilities for label:0 
# and the second part of the list is predicting probabilities for label:1
rdf_tfidf=regressor.predict(x_valid_bow)

rdf_tfidf
Out[58]:
array([0.        , 0.        , 0.04766031, ..., 0.55      , 0.        ,
       0.        ])
In [59]:
# Threshold the regressor's continuous scores into class labels:
# score >= 1 -> 1 (negative sentiment), else 0 (positive sentiment).
# NOTE(review): the original comment claimed a 0.3 threshold but the code
# uses 1 — kept at 1 to preserve the reported F1; confirm which was intended.
prediction_int = rdf_tfidf[:]>=1

# np.int was deprecated in NumPy 1.20 and removed in 1.24 — the builtin
# int is the documented replacement.
prediction_int = prediction_int.astype(int)
prediction_int

# calculating f1 score
log_bow = f1_score(y_valid_bow, prediction_int)

log_bow
Out[59]:
0.3255813953488372
In [60]:
#Random Forest Algorithm for TF/IDF  Features
#Feature Scaling
from sklearn.preprocessing import StandardScaler

sc = StandardScaler(with_mean=False)
x_train_tfidf = sc.fit_transform(x_train_tfidf)
x_valid_tfidf = sc.transform(x_valid_tfidf)
In [61]:
# train the  TF/IDF Random Access algorithm
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(n_estimators=20, random_state=0)
regressor.fit(x_train_tfidf, y_train_tfidf)
y_pred_tfidf = regressor.predict(x_valid_tfidf)
In [62]:
#Evaluating the Algorithm
from sklearn import metrics

print('Mean Absolute Error:', metrics.mean_absolute_error(y_valid_tfidf, y_pred_tfidf))
print('Mean Squared Error:', metrics.mean_squared_error(y_valid_tfidf, y_pred_tfidf))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_valid_tfidf, y_pred_tfidf)))
Mean Absolute Error: 0.06883136965821589
Mean Squared Error: 0.042317774509399206
Root Mean Squared Error: 0.20571284478466387
In [63]:
#Using Random Forest for Classification evalaute the TF/IDF
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
score_tfidf = accuracy_score(y_valid_tfidf, y_pred_tfidf.round())
print(confusion_matrix(y_valid_tfidf,y_pred_tfidf.round()))
print(classification_report(y_valid_tfidf,y_pred_tfidf.round()))
print(score_tfidf)
[[8761  180]
 [ 324  324]]
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      8941
           1       0.64      0.50      0.56       648

    accuracy                           0.95      9589
   macro avg       0.80      0.74      0.77      9589
weighted avg       0.94      0.95      0.94      9589

0.9474397747418918
In [64]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
In [65]:
Log_Reg = LogisticRegression(random_state=0,solver='lbfgs')
In [66]:
#Using Bag-of-Words Features
# Fitting the Logistic Regression Model

Log_Reg.fit(x_train_bow,y_train_bow)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:939: ConvergenceWarning:

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

Out[66]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
In [67]:
# The first part of the list is predicting probabilities for label:0 
# and the second part of the list is predicting probabilities for label:1
prediction_bow = Log_Reg.predict_proba(x_valid_bow)

prediction_bow
Out[67]:
array([[9.89419398e-01, 1.05806023e-02],
       [9.99976392e-01, 2.36082665e-05],
       [9.30185103e-01, 6.98148968e-02],
       ...,
       [9.94381665e-01, 5.61833456e-03],
       [9.95403477e-01, 4.59652291e-03],
       [9.82671726e-01, 1.73282743e-02]])
In [68]:
#Calculating the F1 score
from sklearn.metrics import f1_score
In [69]:
# Threshold the positive-class probability (column 1): >= 0.3 -> label 1
# (negative sentiment), else 0 (positive sentiment).
prediction_int = prediction_bow[:,1]>=0.3

# np.int was deprecated in NumPy 1.20 and removed in 1.24 — use the
# builtin int instead.
prediction_int = prediction_int.astype(int)
prediction_int

# calculating f1 score
log_bow = f1_score(y_valid_bow, prediction_int)

log_bow
Out[69]:
0.5569811320754717
In [70]:
#Using TF-IDF Features
Log_Reg.fit(x_train_tfidf,y_train_tfidf)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:939: ConvergenceWarning:

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

Out[70]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
In [71]:
prediction_tfidf = Log_Reg.predict_proba(x_valid_tfidf)

prediction_tfidf
Out[71]:
array([[9.91185094e-01, 8.81490639e-03],
       [9.98810715e-01, 1.18928481e-03],
       [9.36992757e-01, 6.30072425e-02],
       ...,
       [9.99966354e-01, 3.36464857e-05],
       [9.78537853e-01, 2.14621474e-02],
       [9.99991675e-01, 8.32539848e-06]])
In [72]:
#Calculating the F1 score
# Threshold the positive-class probability (column 1) at 0.3, as above.
prediction_int = prediction_tfidf[:,1]>=0.3

# np.int was removed in NumPy 1.24 — use the builtin int instead.
prediction_int = prediction_int.astype(int)
prediction_int

# calculating f1 score
log_tfidf = f1_score(y_valid_tfidf, prediction_int)

log_tfidf
Out[72]:
0.5551497443389335
In [73]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier
In [74]:
dct = DecisionTreeClassifier(criterion='entropy', random_state=1)
In [75]:
#Using Bag-of-Words Features
dct.fit(x_train_bow,y_train_bow)
Out[75]:
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=1, splitter='best')
In [76]:
dct.score(x_train_bow,y_train_bow)
Out[76]:
0.991820497921602
In [77]:
dct_bow = dct.predict_proba(x_valid_bow)

dct_bow
Out[77]:
array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])
In [78]:
# Threshold the decision tree's positive-class probability (column 1):
# >= 0.3 -> label 1 (negative sentiment), else 0 (positive sentiment).
dct_bow=dct_bow[:,1]>=0.3

# converting the results to integer type
# (np.int was removed in NumPy 1.24 — the builtin int is the replacement)
dct_int_bow=dct_bow.astype(int)

# calculating f1 score
dct_score_bow=f1_score(y_valid_bow,dct_int_bow)

dct_score_bow
Out[78]:
0.5141776937618148
In [79]:
#Using TF-IDF Features
dct.fit(x_train_tfidf,y_train_tfidf)
Out[79]:
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=1, splitter='best')
In [80]:
#find probability
dct_tfidf = dct.predict_proba(x_valid_tfidf)

dct_tfidf
Out[80]:
array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])
In [81]:
#Calculating F1 Score
# Threshold the decision tree's positive-class probability (column 1):
# >= 0.3 -> label 1 (negative sentiment), else 0 (positive sentiment).
dct_tfidf=dct_tfidf[:,1]>=0.3

# converting the results to integer type
# (np.int was removed in NumPy 1.24 — the builtin int is the replacement)
dct_int_tfidf=dct_tfidf.astype(int)

# calculating f1 score
dct_score_tfidf=f1_score(y_valid_tfidf,dct_int_tfidf)

dct_score_tfidf
Out[81]:
0.5498821681068342
In [82]:
#Model Comparison
Algo=['LogisticRegression(Bag-of-Words)','RandomForest(Bag-of-Words)','DecisionTree(Bag-of-Words)','LogisticRegression(TF-IDF)','RandomForest(TF-IDF)','DecisionTree(TF-IDF)']
In [83]:
# NOTE(review): score_bow and score_tfidf hold *accuracy* scores (cells
# In[57]/In[63]) while the other four entries are F1 scores, yet the column
# below is labelled 'F1_Score' — the comparison mixes metrics; confirm which
# metric was intended. Also, log_bow first held the random-forest F1
# (In[59]) and was then overwritten by the logistic-regression F1 (In[69]).
score = [log_bow,score_bow,dct_score_bow,log_tfidf,score_tfidf,dct_score_tfidf]

compare=pd.DataFrame({'Model':Algo,'F1_Score':score},index=[i for i in range(1,7)])
In [84]:
compare.T
Out[84]:
1 2 3 4 5 6
Model LogisticRegression(Bag-of-Words) RandomForest(Bag-of-Words) DecisionTree(Bag-of-Words) LogisticRegression(TF-IDF) RandomForest(TF-IDF) DecisionTree(TF-IDF)
F1_Score 0.556981 0.933465 0.514178 0.55515 0.94744 0.549882
In [85]:
plt.figure(figsize=(18,5))

sns.pointplot(x='Model',y='F1_Score',data=compare)

plt.title('Model Vs Score')
plt.xlabel('MODEL')
plt.ylabel('SCORE')

plt.show()
In [86]:
import plotly.express as px

dff = compare
fig = px.line(dff, x="Model", y="F1_Score")
fig.show()
In [87]:
sns.countplot(train_original['label'])
sns.despine()
In [ ]:
 
In [ ]:
 
In [ ]: